// Compile-time unrolled kernel used by my_axpy.
//
// my_axpy_ftor<Offset, Max>()(u, v, w, i) performs
//     u[i + k] = 3.0f * v[i + k] + w[i + k]    for k = Offset, ..., Max - 1
// by instantiating itself with Offset + 1 until the <Max, Max>
// specialization below is selected, which ends the recursion.
//
// Requirements: Offset <= Max (otherwise the recursion never reaches the
// terminator), and i + Max - 1 must be a valid index into u, v and w --
// no bounds checking is done here.
template <unsigned Offset, unsigned Max>
struct my_axpy_ftor {
    template <typename U, typename V, typename W>
    void operator()(U& u, const V& v, const W& w, unsigned i) {
        u[i + Offset] = 3.0f * v[i + Offset] + w[i + Offset];
        // Handle the next element of the block in the next instantiation.
        my_axpy_ftor<Offset + 1, Max>()(u, v, w, i);
    }
};

// Recursion terminator: Offset has reached Max, so nothing is left to do.
template <unsigned Max>
struct my_axpy_ftor<Max, Max> {
    template <typename U, typename V, typename W>
    void operator()(U&, const V&, const W&, unsigned) {}
};

// Computes u[i] = 3.0f * v[i] + w[i] for every index i of u.
//
// The bulk of the range is processed in blocks of BSize elements whose
// bodies are unrolled at compile time through my_axpy_ftor; the remaining
// s % BSize elements are handled by a plain loop.
//
// Template parameters:
//   BSize   - unroll factor; must be greater than zero (it is used as a
//             divisor below).
//   U, V, W - types providing size() and operator[].
//
// Parameters:
//   u - destination, overwritten element by element.
//   v - input that is scaled by 3.0f.
//   w - input that is added.
//
// Precondition (checked with assert): u, v and w all have the same size.
template <unsigned BSize, typename U, typename V, typename W>
void my_axpy(U& u, const V& v, const W& w) {
    assert(u.size() == v.size() && v.size() == w.size());

    const unsigned s = u.size();
    // Largest multiple of BSize that does not exceed s.
    const unsigned sb = s / BSize * BSize;

    // Unrolled part: each iteration handles one full block of BSize elements.
    for (unsigned i = 0; i < sb; i += BSize) {
        my_axpy_ftor<0, BSize>()(u, v, w, i);
    }

    // Tail: elements that do not fill a complete block.
    for (unsigned i = sb; i < s; ++i) {
        u[i] = 3.0f * v[i] + w[i];
    }
}